{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "80900a36-790d-461d-872a-90884c0c7383",
   "metadata": {},
   "source": [
    "# Data Labeling using GPT-3 for Portuguese"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6d61ca08-5fe1-49af-a325-5ba39ebad2b5",
   "metadata": {},
   "source": [
    "This study aims to labeling data in portuguese for sentiment analysis task. We use the e-commerce review dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ca384797-7c0c-4228-b26e-b8d80f31f650",
   "metadata": {},
   "source": [
    "### Install packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9c4f7434-26da-49ca-8762-086a3b193be2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: openai in /Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages (0.10.4)\n",
      "Requirement already satisfied: openpyxl>=3.0.7 in /Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages (from openai) (3.0.9)\n",
      "Requirement already satisfied: pandas>=1.2.3 in /Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages (from openai) (1.2.4)\n",
      "Requirement already satisfied: tqdm in /Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages (from openai) (4.49.0)\n",
      "Requirement already satisfied: pandas-stubs>=1.1.0.11 in /Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages (from openai) (1.2.0.17)\n",
      "Requirement already satisfied: requests>=2.20 in /Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages (from openai) (2.25.1)\n",
      "Requirement already satisfied: et-xmlfile in /Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages (from openpyxl>=3.0.7->openai) (1.1.0)\n",
      "Requirement already satisfied: numpy>=1.16.5 in /Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages (from pandas>=1.2.3->openai) (1.20.3)\n",
      "Requirement already satisfied: python-dateutil>=2.7.3 in /Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages (from pandas>=1.2.3->openai) (2.8.1)\n",
      "Requirement already satisfied: pytz>=2017.3 in /Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages (from pandas>=1.2.3->openai) (2021.1)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages (from pandas-stubs>=1.1.0.11->openai) (3.7.4.3)\n",
      "Requirement already satisfied: six>=1.5 in /Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas>=1.2.3->openai) (1.15.0)\n",
      "Requirement already satisfied: idna<3,>=2.5 in /Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages (from requests>=2.20->openai) (2.10)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages (from requests>=2.20->openai) (2020.12.5)\n",
      "Requirement already satisfied: chardet<5,>=3.0.2 in /Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages (from requests>=2.20->openai) (4.0.0)\n",
      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/bot/.local/lib/python3.7/site-packages (from requests>=2.20->openai) (1.25.11)\n",
      "\u001b[33mWARNING: You are using pip version 21.1.3; however, version 21.2.4 is available.\n",
      "You should consider upgrading via the '/Users/bot/.pyenv/versions/3.7.4/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip install openai"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6dbfacf5-67be-4de8-a5c6-0a64ecb5d773",
   "metadata": {},
   "source": [
    "### Download dataset\n",
    "E-commerce product review dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bfb49045-b17c-4b3d-87ee-ff45360fb031",
   "metadata": {},
   "outputs": [],
   "source": [
    "!wget https://raw.githubusercontent.com/b2wdigital/b2w-reviews01/main/B2W-Reviews01.csv -P data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "39f8f39c-2e75-4a88-9d11-fd5190c9fa0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"YOUR_OPENAI_API_KEY_HERE\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21bf42ae-5107-4b52-8488-3fb0dd578548",
   "metadata": {},
   "source": [
    "### Import packages "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ac09c38d-6876-42d3-a6fd-8ba0a0934841",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from AutoLabeling import AutoLabeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "22c68436-46b4-4655-83f9-8e334c874913",
   "metadata": {},
   "outputs": [],
   "source": [
    "random_state = 21"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6d794425-db83-4aa7-a3b1-b5a448d533c5",
   "metadata": {},
   "source": [
    "### Load dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d2dba7c6-23c1-4133-9b8d-c3d5e8741af2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>recommend_to_a_friend</th>\n",
       "      <th>review_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Yes</td>\n",
       "      <td>Estou contente com a compra entrega rápida o ú...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Yes</td>\n",
       "      <td>Por apenas R$1994.20,eu consegui comprar esse ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Yes</td>\n",
       "      <td>SUPERA EM AGILIDADE E PRATICIDADE OUTRAS PANEL...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Yes</td>\n",
       "      <td>MEU FILHO AMOU! PARECE DE VERDADE COM TANTOS D...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Yes</td>\n",
       "      <td>A entrega foi no prazo, as americanas estão de...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  recommend_to_a_friend                                        review_text\n",
       "0                   Yes  Estou contente com a compra entrega rápida o ú...\n",
       "1                   Yes  Por apenas R$1994.20,eu consegui comprar esse ...\n",
       "2                   Yes  SUPERA EM AGILIDADE E PRATICIDADE OUTRAS PANEL...\n",
       "3                   Yes  MEU FILHO AMOU! PARECE DE VERDADE COM TANTOS D...\n",
       "4                   Yes  A entrega foi no prazo, as americanas estão de..."
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv(\"data/B2W-Reviews01.csv\", usecols=[\"review_text\",\"recommend_to_a_friend\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "563d0401-1501-4bf7-aafc-3ecc53c9adff",
   "metadata": {},
   "source": [
    "### Define the label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0bbd332d-3ecd-4c1d-91d0-ba029ac2679e",
   "metadata": {},
   "outputs": [],
   "source": [
    "no_df = df[df[\"recommend_to_a_friend\"] == \"No\"].sample(n=500, random_state=random_state)\n",
    "yes_df = df[df[\"recommend_to_a_friend\"] == \"Yes\"].sample(n=500, random_state=random_state)\n",
    "\n",
    "label_df = pd.concat([no_df, yes_df])\n",
    "\n",
    "label_df[\"sentiment\"] = label_df[\"recommend_to_a_friend\"].apply(lambda x: \"positivo\" if str(x).lower() == \"yes\" else \"negativo\")\n",
    "\n",
    "cols = [\"sentiment\",  \"review_text\"]\n",
    "label_df[cols].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b6bfec54-4e57-4554-afab-664d68bcfb30",
   "metadata": {},
   "source": [
    "### Init the AutoLabeling class with few examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "5ddb091d-ea24-46a1-bee4-16697c038694",
   "metadata": {},
   "outputs": [],
   "source": [
    "few_shot_df = label_df.sample(n=10)\n",
    "\n",
    "labeling = AutoLabeling(label_df, text_col=\"review_text\", label_col=\"sentiment\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9b2e76d1-6262-466c-a6f4-ab017599dea9",
   "metadata": {},
   "source": [
    "### Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "259cd67b-a992-482e-8c46-46641527e9f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "labeling.execute(\"nao gostei da qualidade do produto\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "cfa5609f-c408-42e8-a288-23d41827dfb8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'positivo'"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "labeling.execute(\"adorei a qualidade do produto\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d99bb09f-c368-4eb8-a2ba-707f586510dc",
   "metadata": {},
   "source": [
    "### Apply to dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "4964c582-1146-471c-82eb-a7af2544b031",
   "metadata": {},
   "outputs": [],
   "source": [
    "label_df[\"labeling\"] = label_df[\"review_text\"].apply(labeling.execute)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "c2c1464c-3768-4629-b7c6-979ca775c364",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>recommend_to_a_friend</th>\n",
       "      <th>review_text</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>labeling</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>113141</th>\n",
       "      <td>No</td>\n",
       "      <td>Entrega estava prevista para 7 dias e chegou e...</td>\n",
       "      <td>negativo</td>\n",
       "      <td>negativo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99572</th>\n",
       "      <td>No</td>\n",
       "      <td>o produto era pra ter chegado a dois dias send...</td>\n",
       "      <td>negativo</td>\n",
       "      <td>negativo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38618</th>\n",
       "      <td>No</td>\n",
       "      <td>Recebi o produto dentro do prazo estabelecido,...</td>\n",
       "      <td>negativo</td>\n",
       "      <td>negativo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87421</th>\n",
       "      <td>No</td>\n",
       "      <td>Nao gostei da qualidade do produto, veio falta...</td>\n",
       "      <td>negativo</td>\n",
       "      <td>negativo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>123565</th>\n",
       "      <td>No</td>\n",
       "      <td>Comprei e nao recebi produlto ainda adiarao a ...</td>\n",
       "      <td>negativo</td>\n",
       "      <td>negativo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>88774</th>\n",
       "      <td>Yes</td>\n",
       "      <td>adorei a minha compra,chegou na data certa, fo...</td>\n",
       "      <td>positivo</td>\n",
       "      <td>positivo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>88869</th>\n",
       "      <td>Yes</td>\n",
       "      <td>Só não comprei pois tenho que esperar de 11 a ...</td>\n",
       "      <td>positivo</td>\n",
       "      <td>negativo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57841</th>\n",
       "      <td>Yes</td>\n",
       "      <td>gostei muito do aspirador,assim que recebi o p...</td>\n",
       "      <td>positivo</td>\n",
       "      <td>positivo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18590</th>\n",
       "      <td>Yes</td>\n",
       "      <td>Produto bom atendeu minhas expectativas, fone ...</td>\n",
       "      <td>positivo</td>\n",
       "      <td>negativo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97355</th>\n",
       "      <td>Yes</td>\n",
       "      <td>Produto veio em embalagem diferente do site, f...</td>\n",
       "      <td>positivo</td>\n",
       "      <td>negativo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       recommend_to_a_friend  \\\n",
       "113141                    No   \n",
       "99572                     No   \n",
       "38618                     No   \n",
       "87421                     No   \n",
       "123565                    No   \n",
       "...                      ...   \n",
       "88774                    Yes   \n",
       "88869                    Yes   \n",
       "57841                    Yes   \n",
       "18590                    Yes   \n",
       "97355                    Yes   \n",
       "\n",
       "                                              review_text sentiment  labeling  \n",
       "113141  Entrega estava prevista para 7 dias e chegou e...  negativo  negativo  \n",
       "99572   o produto era pra ter chegado a dois dias send...  negativo  negativo  \n",
       "38618   Recebi o produto dentro do prazo estabelecido,...  negativo  negativo  \n",
       "87421   Nao gostei da qualidade do produto, veio falta...  negativo  negativo  \n",
       "123565  Comprei e nao recebi produlto ainda adiarao a ...  negativo  negativo  \n",
       "...                                                   ...       ...       ...  \n",
       "88774   adorei a minha compra,chegou na data certa, fo...  positivo  positivo  \n",
       "88869   Só não comprei pois tenho que esperar de 11 a ...  positivo  negativo  \n",
       "57841   gostei muito do aspirador,assim que recebi o p...  positivo  positivo  \n",
       "18590   Produto bom atendeu minhas expectativas, fone ...  positivo  negativo  \n",
       "97355   Produto veio em embalagem diferente do site, f...  positivo  negativo  \n",
       "\n",
       "[1000 rows x 4 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "label_df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3375cad0-6e38-45b3-8e0a-f4f40c87ec24",
   "metadata": {},
   "source": [
    "### Evaluate labeling performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "45b68096-6328-49c7-b3fe-fdf191095e1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, average_precision_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "146bbd1c-3af3-468e-a4f4-7a01e284f596",
   "metadata": {},
   "outputs": [],
   "source": [
    "mapping = {'negativo': 0, 'positivo': 1}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "7504c561-e83e-4515-bd39-b0de5b8ec121",
   "metadata": {},
   "outputs": [],
   "source": [
    "label_df[\"true\"] = label_df[\"sentiment\"].map(mapping)\n",
    "label_df[\"pred\"] = label_df[\"labeling\"].map(mapping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "3723eab4-7c8b-4cb4-a9fd-bbdd9c90eace",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.837"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "roc_auc_score(label_df[\"true\"].values, label_df[\"pred\"].values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "692e505c-2056-4770-a542-6fb746c4a80f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8223875338753387"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "average_precision_score(label_df[\"true\"].values, label_df[\"pred\"].values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "29daae51-1964-4bd8-8f8e-ed576842cd5f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.77      0.97      0.86       500\n",
      "           1       0.96      0.71      0.81       500\n",
      "\n",
      "    accuracy                           0.84      1000\n",
      "   macro avg       0.86      0.84      0.83      1000\n",
      "weighted avg       0.86      0.84      0.83      1000\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(\n",
    "    classification_report(label_df[\"true\"].values, label_df[\"pred\"].values)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "60fb588b-5be9-49d7-8911-8ee6d9ae2cf0",
   "metadata": {},
   "source": [
    "### Error analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "57709a7d-4085-4f29-88b0-673153368916",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(163, 6)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "label_df[label_df[\"true\"] != label_df[\"pred\"]].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "b10aa303-f237-4d40-a9c8-08904f43f038",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "113308 | text: O antiaderente cerâmico é muito superior ao teflon.\n",
      "Label: negativo | labeling: positivo | recommend: No\n",
      "\n",
      "32207 | text: Eu recomendo vale a pena , pois mantém o café quente sem precisar por na garrafa térmica.\n",
      "Label: positivo | labeling: negativo | recommend: Yes\n",
      "\n",
      "101232 | text: Ainda não recebi o produto , fiz a compra a quase um mês e não foi entregue ainda pela  transportadora. Esperando receber para avaliar.\n",
      "Label: positivo | labeling: negativo | recommend: Yes\n",
      "\n",
      "89237 | text: O produto  muito bom , meu filho adorou  Oi preta também vou vai gostar Eu recomendo,material bom , e bem grande o espaço\n",
      "Label: positivo | labeling: negativo | recommend: Yes\n",
      "\n",
      "24187 | text: Recomendo o produto e também o site para a compra, o mesmo foi entregue antes do prazo previsto, muito bom.\n",
      "Label: negativo | labeling: positivo | recommend: No\n",
      "\n",
      "21070 | text: Bom rapidez na entrega..,............mas na entrega o entregador não solicitou nenhum documento tá pessoa que recebeu só perguntou o nome .\n",
      "Label: positivo | labeling: negativo | recommend: Yes\n",
      "\n",
      "97355 | text: Produto veio em embalagem diferente do site, fiquei em duvida se era o mesmo produto, funcionamento do produto está ok.\n",
      "Label: positivo | labeling: negativo | recommend: Yes\n",
      "\n",
      "60950 | text: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n",
      "Label: positivo | labeling: negativo | recommend: Yes\n",
      "\n",
      "83127 | text: Produto bom, nada muito especial visto que o preço é alto. Houve um problema grave na entrega, 20 dias de ATRASO é muita coisa, achei desrespeitoso, um descaso.\n",
      "Label: positivo | labeling: negativo | recommend: Yes\n",
      "\n",
      "98171 | text: Trabalho com livros do professor há tempos. Esperava que esse fosse como outros, que tem os exercícios e as respostas... porém só vem as respostas. Em sala de aula sigo usando o do aluno... esse acabei deixando em casa, só para o planejamento de aula, caso tenha alguma dúvida na resposta de um exercício.\n",
      "Label: positivo | labeling: negativo | recommend: Yes\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for idx, row in label_df[label_df[\"true\"] != label_df[\"pred\"]].sample(n=10).iterrows():\n",
    "    print(f\"\"\"{idx} | text: {row[\"review_text\"]}\\nLabel: {row[\"sentiment\"]} | labeling: {row[\"labeling\"]} | recommend: {row[\"recommend_to_a_friend\"]}\"\"\")\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "5ee95a0a-7c8f-4440-9263-cb18578e8656",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from lightgbm import LGBMClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "923de8a4-23cc-4d05-bb90-4a58963e00ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "word_vectorizer = TfidfVectorizer(sublinear_tf=True,\n",
    "                                  strip_accents='unicode',\n",
    "                                  analyzer='word',\n",
    "                                  max_features=10000)\n",
    "\n",
    "classifier = RandomForestClassifier()\n",
    "     \n",
    "pipeline = Pipeline([\n",
    "    ('vect', word_vectorizer),\n",
    "    ('clf', classifier),\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42f28e53-896e-470b-8c66-801375c91a9c",
   "metadata": {},
   "source": [
    "### Train on original label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "75cfe080-94cf-47b4-acaa-02a2450a94aa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(768,) (768,) (192,) (192,)\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "    negativo       0.83      0.79      0.81        96\n",
      "    positivo       0.80      0.83      0.82        96\n",
      "\n",
      "    accuracy                           0.81       192\n",
      "   macro avg       0.81      0.81      0.81       192\n",
      "weighted avg       0.81      0.81      0.81       192\n",
      "\n"
     ]
    }
   ],
   "source": [
    "label_df_ = label_df.dropna(subset=[\"review_text\"])\n",
    "X_train, X_test, y_train, y_test = train_test_split(label_df_[\"review_text\"].values, label_df_[\"sentiment\"].values, test_size=0.2)\n",
    "\n",
    "print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)\n",
    "\n",
    "pipeline.fit(X_train, y_train)\n",
    "\n",
    "y_pred = pipeline.predict(X_test)\n",
    "\n",
    "print(\n",
    "    classification_report(y_test, y_pred)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "60891687-dcb9-405f-976f-858eb07a6684",
   "metadata": {},
   "source": [
    "### Train on generated label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "9d946e7d-fa0a-4c47-9885-64ad027d5bb6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(768,) (768,) (192,) (192,)\n",
      "                                   precision    recall  f1-score   support\n",
      "\n",
      "aticamente resolver o problema. n       0.00      0.00      0.00         1\n",
      "                         negativo       0.75      0.90      0.82       107\n",
      "                         positivo       0.83      0.63      0.72        84\n",
      "\n",
      "                         accuracy                           0.78       192\n",
      "                        macro avg       0.53      0.51      0.51       192\n",
      "                     weighted avg       0.78      0.78      0.77       192\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1248: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n",
      "/Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1248: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n",
      "/Users/bot/.pyenv/versions/3.7.4/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1248: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n"
     ]
    }
   ],
   "source": [
    "label_df_ = label_df.dropna(subset=[\"review_text\"])\n",
    "X_train, X_test, y_train, y_test = train_test_split(label_df_[\"review_text\"].values, label_df_[\"labeling\"].values, test_size=0.2)\n",
    "\n",
    "print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)\n",
    "\n",
    "pipeline.fit(X_train, y_train)\n",
    "\n",
    "y_pred = pipeline.predict(X_test)\n",
    "\n",
    "print(\n",
    "    classification_report(y_test, y_pred)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dfb80edc-e57a-4ea1-b4b0-3fe4ede160eb",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
